Exercise 1: MLE and MAP (10 points)

Exercise 2: Linear regression with greta (40 points)

Load and preprocess the data (2 points)

# Read the avocado data set from the course repository and tidy it:
# drop currently irrelevant columns and give the remaining variables
# convenient snake_case names.
avocado_data <- read_csv(
  url('https://raw.githubusercontent.com/michael-franke/intro-data-analysis/master/data_sets/avocado.csv')
) %>%
  # drop the row-index column, all "...Bags" columns, year and region
  select(-X1, -contains("Bags"), -year, -region) %>%
  # rename variables of interest (backticks for non-syntactic names)
  rename(
    total_volume_sold = `Total Volume`,
    average_price     = AveragePrice,
    small             = `4046`,
    medium            = `4225`,
    large             = `4770`
  )

Plot the data (4 points)

# Scatter plot of average price against log total volume sold, with an
# lm-based smoother as a first (frequentist) regression line.
ggplot(avocado_data, aes(x = log(total_volume_sold), y = average_price)) +
  geom_point(alpha = 0.3, color = "darkgray") +
  geom_smooth(method = "lm", color = "firebrick")

Find the MLE (6 points)

# Negative log-likelihood of a simple linear regression model,
# y ~ Normal(beta_0 + beta_1 * x, sd), for fixed parameter values.
#
# Arguments:
#   y      : vector of observed outcomes
#   x      : vector of predictor values (same length as y)
#   beta_0 : intercept
#   beta_1 : slope
#   sd     : standard deviation of the likelihood (must be > 0)
#
# Returns the summed negative log-likelihood; Inf for sd <= 0 so that
# optimizers are pushed away from impossible parameter values.
nll <- function(y, x, beta_0, beta_1, sd) {
  # a non-positive sigma is logically impossible
  if (sd <= 0) {
    return(Inf)
  }
  # model predictions
  y_pred <- beta_0 + beta_1 * x
  # negative log-likelihood of each observation, summed
  -sum(dnorm(y, mean = y_pred, sd = sd, log = TRUE))
}
# Maximum-likelihood fit: minimize the negative log-likelihood over
# (intercept, slope, standard deviation), starting from rough guesses.
fit_lh <- optim(
  # initial values: intercept, slope, sd
  par = c(1.5, 0, 0.5),
  # objective: nll of the avocado data for a candidate parameter vector
  fn  = function(par) {
    nll(
      y      = avocado_data$average_price,
      x      = log(avocado_data$total_volume_sold),
      beta_0 = par[1],
      beta_1 = par[2],
      sd     = par[3]
    )
  }
)

# display the best-fitting parameter values
tibble(
  parameter = c("intercept", "slope", "standard deviation"),
  value     = fit_lh$par
)
## # A tibble: 3 x 2
##   parameter           value
##   <chr>               <dbl>
## 1 intercept           2.57 
## 2 slope              -0.102
## 3 standard deviation  0.327

Plot a second regression line (6 points)

# Same scatter plot as before, now with the MLE regression line
# (black, via geom_abline) drawn on top of the lm smoother (firebrick).
# The two lines should coincide, since lm() also computes the MLE.
avocado_data %>%
  ggplot(aes(x = log(total_volume_sold), y = average_price)) +
  geom_point(alpha = 0.3, color = "darkgray") +
  geom_smooth(method = "lm", color = "firebrick") +
  geom_abline(
    aes(intercept = fit_lh$par[1], slope = fit_lh$par[2])
  )

Implement the model with greta (6 points)

# select data to use
price     <- as_data(avocado_data$average_price)
log_sold  <- as_data(log(avocado_data$total_volume_sold))
# latent variables and priors
intercept <- student(df = 1, mu = 0, sigma = 10) # df = degrees of freedom
slope     <- student(df = 1, mu = 0, sigma = 10)
sigma     <- normal(0, 5, truncation = c(0, Inf)) # half-normal: sd must be positive
# derived latent variable (linear predictor); named `mu` so we do not
# shadow base::mean()
mu <- intercept + slope * log_sold
# likelihood: attach the sampling distribution to the OBSERVED data.
# NB: this must be `distribution(price) <- ...`. The original
# `price <- normal(mean, sigma)` overwrote the data array with a fresh
# latent variable, so the model never conditioned on the observations
# (cf. the correct distribution() usage in Exercise 3).
distribution(price) <- normal(mu, sigma)
# finalize model, register which parameters to monitor
m <- model(intercept, slope, sigma)

Obtain samples (4 points)

# Draw posterior samples with greta's HMC sampler: 4 chains, each with
# 1000 warmup iterations (discarded) and 2000 retained samples,
# giving 8000 draws per monitored parameter in total.
draws <- greta::mcmc(
  model = m, 
  n_samples = 2000,
  warmup = 1000,
  chains = 4
)

Wrangle, summarize and interpret (6 points)

# cast results (type 'mcmc.list') into tidy tibble:
# one row per iteration/chain/parameter combination
tidy_draws <- ggmcmc::ggs(draws)
tidy_draws
## # A tibble: 24,000 x 4
##    Iteration Chain Parameter    value
##        <int> <int> <fct>        <dbl>
##  1         1     1 intercept -0.00875
##  2         2     1 intercept -0.00883
##  3         3     1 intercept -0.00907
##  4         4     1 intercept -0.00860
##  5         5     1 intercept -0.0102 
##  6         6     1 intercept -0.00985
##  7         7     1 intercept -0.00932
##  8         8     1 intercept -0.00972
##  9         9     1 intercept -0.00908
## 10        10     1 intercept -0.00870
## # … with 23,990 more rows
# obtain Bayesian point and interval estimates
# Bayesian point estimate (posterior mean) and 95% HDI bounds
# for each monitored parameter.
Bayes_estimates <- tidy_draws %>%
  group_by(Parameter) %>%
  summarise(
    `|95%` = HDInterval::hdi(value)[1],  # lower bound of 95% HDI
    mean   = mean(value),                # posterior mean
    `95|%` = HDInterval::hdi(value)[2]   # upper bound of 95% HDI
  )
Bayes_estimates
## # A tibble: 3 x 4
##   Parameter   `|95%`      mean  `95|%`
##   <fct>        <dbl>     <dbl>   <dbl>
## 1 intercept -0.0186  -0.00285  0.0123 
## 2 sigma      0.0742   0.0776   0.0816 
## 3 slope     -0.00121  0.000106 0.00139

The posterior estimates of the slope parameter suggest at most a marginal linear relationship between (log of) volume sold and average price, since the point estimate and the entire 95% credible interval lie around zero. Note, however, that these posterior estimates (intercept ≈ 0, sigma ≈ 0.08) are far from the MLE computed above (intercept ≈ 2.57, sd ≈ 0.33). This discrepancy suggests the sampler did not actually condition on the observed data, so the model specification should be double-checked before trusting this conclusion.

Point estimates under improper priors in greta (6 points)

# select data to use
price     <- as_data(avocado_data$average_price)
log_sold  <- as_data(log(avocado_data$total_volume_sold))
# latent variables with improper (flat) priors
intercept <- variable()
slope     <- variable()
sigma     <- variable(lower = 0) # standard deviation must be positive
# derived latent variable (linear predictor); `mu` avoids shadowing base::mean()
mu <- intercept + slope * log_sold
# likelihood: attach the sampling distribution to the observed data.
# NB: must be `distribution(price) <- ...`; the original
# `price <- normal(mean, sigma)` replaced the data array with an
# unconnected latent variable, so the data were never conditioned on.
distribution(price) <- normal(mu, sigma)
# finalize model, register which parameters to monitor
m <- model(intercept, slope, sigma)

# Maximize the posterior (MAP; identical to the MLE under flat priors).
# The result is cached next to the script so re-knitting is cheap, and
# recomputed if no cache exists — instead of a machine-specific absolute
# path that breaks on any other computer.
opt_cache <- "opt_ex2.rds"
if (file.exists(opt_cache)) {
  opt_ex2 <- readRDS(opt_cache)
} else {
  opt_ex2 <- greta::opt(model = m)
  saveRDS(opt_ex2, opt_cache)
}
opt_ex2
## $par
## $par$intercept
## [1] 2.565098
## 
## $par$slope
## [1] -0.1024293
## 
## $par$sigma
## [1] 0.3270453
## 
## 
## $value
## [1] 5497.596
## 
## $iterations
## [1] 15
## 
## $convergence
## [1] 0

The values returned by the greta optimizer are the best-fitting parameters for our model (intercept = 2.565, slope = -0.102, standard deviation = 0.327). They match the estimates obtained earlier with `optim`: under improper (flat) priors the MAP estimate coincides with the MLE, i.e. with the parameters that maximize the likelihood function of our Gaussian model.

Exercise 3: Analyzing the King of France (22 points)

Get the data (6 points)

data_KoF_cleaned <- read_csv(url('https://raw.githubusercontent.com/michael-franke/intro-data-analysis/master/data_sets/king-of-france_data_cleaned.csv'))

# k0 and N0 for condition 0
# number of "TRUE" responses in Condition 0.
# (The original group_by(condition) was a no-op: after filtering to a
# single condition there is only one group, and nrow() ignores grouping.)
k0 <- data_KoF_cleaned %>%
  filter(condition == "Condition 0", response == "TRUE") %>%
  nrow()
k0
## [1] 7
# total number of observations (trials) in Condition 0
N0 <- data_KoF_cleaned %>% filter(condition == "Condition 0") %>% nrow()
N0
## [1] 75
# k1 and N1 for condition 1
# number of "TRUE" responses in Condition 1.
# (The redundant group_by(condition) is dropped: it had no effect on nrow().)
k1 <- data_KoF_cleaned %>%
  filter(condition == "Condition 1", response == "TRUE") %>%
  nrow()
k1
## [1] 3
# total number of observations (trials) in Condition 1
N1 <- data_KoF_cleaned %>% filter(condition == "Condition 1") %>% nrow()
N1
## [1] 79

Define a greta model to compare latent biases (10 points)

# declare as greta data arrays (observed counts of "TRUE" responses)
y0 <- as_data(k0)
y1 <- as_data(k1)
# priors: flat Beta(1, 1) priors on the latent bias of each condition
theta_0 <- beta(1, 1)
theta_1 <- beta(1, 1)
# derived parameters: absolute difference between the two latent biases
delta <- abs(theta_0 - theta_1)
# likelihood: number of "TRUE" responses out of N trials per condition
distribution(y0) <- binomial(N0, theta_0)
distribution(y1) <- binomial(N1, theta_1)
# model 
m <- model(theta_0, theta_1, delta)

Sample and interpret (6 points)

# sample from the posterior (1000 warmup draws per chain are discarded,
# 2000 samples retained) and cast the result into a tidy tibble
draws <- greta::mcmc(m, warmup = 1000, n_samples = 2000)
tidy_draws <- ggmcmc::ggs(draws)
tidy_draws
## # A tibble: 24,000 x 4
##    Iteration Chain Parameter   value
##        <int> <int> <fct>       <dbl>
##  1         1     1 delta     0.0129 
##  2         2     1 delta     0.00809
##  3         3     1 delta     0.0182 
##  4         4     1 delta     0.00598
##  5         5     1 delta     0.00596
##  6         6     1 delta     0.00134
##  7         7     1 delta     0.0142 
##  8         8     1 delta     0.0823 
##  9         9     1 delta     0.134  
## 10        10     1 delta     0.0992 
## # … with 23,990 more rows
# posterior mean and 95% HDI bounds per monitored parameter
Bayes_estimates <- tidy_draws %>%
  group_by(Parameter) %>%
  summarise(
    `|95%` = HDInterval::hdi(value)[1],  # lower 95% HDI bound
    mean   = mean(value),                # posterior mean
    `95|%` = HDInterval::hdi(value)[2]   # upper 95% HDI bound
  )
Bayes_estimates
## # A tibble: 3 x 4
##   Parameter    `|95%`   mean `95|%`
##   <fct>         <dbl>  <dbl>  <dbl>
## 1 delta     0.0000667 0.0574 0.124 
## 2 theta_0   0.0390    0.103  0.170 
## 3 theta_1   0.0102    0.0494 0.0958

The Bayesian estimates of the parameter delta suggest that participants are more likely to respond “true” to false implicit presuppositions (condition 0) than to false explicit assertions (condition 1). The estimated difference between the two response biases is about 6% on average, with the upper bound of the 95% credible interval around 13%; note, though, that the lower bound is very close to zero, so the evidence for a genuine difference is not conclusive.